# Replication code for main BART Models including variable selection

library(bartMachine)
library(foreign)
library(readstata13)
library(ggplot2)

# load the data
nelda4 <- read.dta13("./nelda4b.dta")

nelda4 <- subset(nelda4, multPart1 == 0, select = c(incosWin, year, incRun, incLimit, 
                                                    incSucc, electSuspend, incExtend, 
                                                    manipTime, concCan, 
                                                    oppPrev, oppBoycott, oppHarass,
                                                    mediaBias, econCrisis, recAid,
                                                    violPrior, intMonitor, westMonitor,
                                                    goodUS, westLink, polity2l1, 
                                                    goodGrow, pollExist, incApp, 
                                                    rgdpcgrl, dCPIl1, timeInc2,
                                                    realmarg_1, rgdppcGrow, tranGrowth))

# Note: in new version of BART Machine, 0-1 dummies need to be coded to 1-2 or some results will be reversed.
nelda4$incosWin[nelda4$incosWin == 0] <- 2
train.nelda <- subset(nelda4, year < 2007)
test.nelda <- subset(nelda4, year >= 2007)
# Run thought BART model for variable selection
set.seed(7)
Yfull = as.factor(train.nelda$incosWin)
Xfull = subset(train.nelda, select = c(year, incRun, incLimit, 
                                       incSucc, electSuspend, incExtend, 
                                       manipTime, concCan, 
                                       oppPrev, oppBoycott, oppHarass,
                                       mediaBias, econCrisis, recAid,
                                       violPrior, intMonitor, westMonitor,
                                       goodUS, westLink, polity2l1, 
                                       goodGrow, pollExist, incApp, 
                                       rgdpcgrl, dCPIl1, timeInc2,
                                       realmarg_1, rgdppcGrow, tranGrowth))
bart_machine_train = bartMachineCV(Xfull, Yfull,use_missing_data = TRUE, 
                                   use_missing_data_dummies_as_covars = TRUE)
bart_machine_train



# Investigate variable importance by inclusion properties
svg("./VarImptc.svg")
investigate_var_importance(bart_machine_train, num_replicates_for_avg = 100)
dev.off()

# Principled variable selection
svg("./varselect.svg")
vs = var_selection_by_permute(bart_machine_train)
dev.off()

destroy_bart_machine(bart_machine_train)

# Now that we have a pretty good idea what the main variables are, run the training set again
# Note: We omit missing variables at this stage so the function for the direct effects will work properly
set.seed(7)
train.nelda2 <- subset(train.nelda, select = c("incosWin","polity2l1","incApp","incRun",
                                               "concCan","recAid","goodUS",
                                               "timeInc2","rgdppcGrow","pollExist"))
train.nelda2 <- na.omit(train.nelda2)
Yfull = as.factor(train.nelda2$incosWin)
Xfull = subset(train.nelda2, select = c(polity2l1,incApp,incRun,
                                       concCan,recAid,goodUS,
                                       timeInc2,rgdppcGrow,pollExist))

bart_machine_train = bartMachineCV(Xfull, Yfull)
bart_machine_train

# Create plots to show the impact and importance of different variables
# Partial dependence plot -- importance when other variables are controlled
for(i in bart_machine_train$training_data_features) {
  savelink <- paste("./", 
                    i, ".svg", sep = "")
  svg(savelink)
  pd_plot(bart_machine_train, j=i)
  dev.off()
}

# Investigate variable importance through permutation
for(i in bart_machine_train$training_data_features) {
  savelink <- paste("./", 
                    i, "Impt.svg", sep = "")
  svg(savelink)
  cov_importance_test(bart_machine_train, covariates = c(i))
  dev.off()
}

# Investigate overall model importance through permutation
svg("./modelImpt.svg")
cov_importance_test(bart_machine_train)
dev.off()

destroy_bart_machine(bart_machine_train)


# Now that these steps are complete, re-run with the missing data correction
set.seed(7)
Yfull = as.factor(train.nelda$incosWin)
Xfull = subset(train.nelda, select = c(polity2l1,incApp,incRun,
                                        concCan,recAid,goodUS,
                                        timeInc2,rgdppcGrow,pollExist))

bart_machine_train = bartMachineCV(Xfull, Yfull,use_missing_data = TRUE, 
                                   use_missing_data_dummies_as_covars = TRUE)
bart_machine_train
# Results
# As noted in SI, runs are not completely consistent. 
# These results are representative with accuracy at plus or minus 2% in repeated runs.
# confusion matrix:
#   
#   predicted 1 predicted 2 model errors
# actual 1        251.00      45.000        0.152
# actual 2         59.00     138.000        0.299
# use errors        0.19       0.246        0.211

# Get the probability estimates for bar plots for training data
pred0 <- predict(bart_machine_train, Xfull, type = "prob")
pred0 <- data.frame(cbind(Yfull,pred0))
names(pred0) <- c("actual","probability")
pred0$actual[pred0$actual == 2] <- 0
pred0$actual <- factor(pred0$actual, labels = c("Opposition","Incumbent Party"))
pred0box <- ggplot(pred0,aes(actual,probability)) + geom_boxplot()
pred0box <- pred0box + xlab('Actual Winner') + ylab('Model Probability of Incumbent Party Win')
pred0box <- pred0box + ggtitle('Training Data') + theme_bw()
pred0box

# Get Brier scores for naive and BART model
pred0$actual <- ifelse(pred0$actual == "Incumbent Party",1,0)
incumbentModela <- sum((1-pred0$actual)^2)/length(pred0$actual)
incumbentModela
bartModela <- sum((pred0$probability - pred0$actual)^2)/length(pred0$actual)
bartModela


# Now use this to predict the out-of-sample group.
test.nelda <- subset(test.nelda, select = c("incosWin","polity2l1","incApp","incRun",
                                            "concCan","recAid","goodUS",
                                            "timeInc2","rgdppcGrow","pollExist"))
pred1 <- predict(bart_machine_train, test.nelda[,2:10], type = "prob")
pred2 <- predict(bart_machine_train, test.nelda[,2:10], type = "class")
pvact <- data.frame(cbind(test.nelda$incosWin, pred2, pred1))
names(pvact) <- c("actual", "predicted", "probability")
# True Positives
tpos <- nrow(pvact[pvact$actual == 1 & pvact$predicted == 1,]) #80
# True Negatives
tneg <- nrow(pvact[pvact$actual == 2 & pvact$predicted == 2,]) #24
# False Positives
fpos <- nrow(pvact[pvact$actual == 2 & pvact$predicted == 1,]) #9
# False Negatives
fneg <- nrow(pvact[pvact$actual == 1 & pvact$predicted == 2,]) #15
# Check accuracy
(tpos+tneg)/(tneg+tpos+fneg+fpos)
# 0.8125
# Again, results are representative, with the vast majority falling between plus or minus 1 and all repeated runs (500) within 3%.

# Create boxplot for out-of-sample data
pvact$actual[pvact$actual == 2] <- 0
pvact$actual <- factor(pvact$actual, labels = c("Opposition","Incumbent Party"))
pvactbox <- ggplot(pvact,aes(actual,probability)) + geom_boxplot()
pvactbox <- pvactbox + xlab('Actual Winner') + ylab('Model Probability of Incumbent Party Win')
pvactbox <- pvactbox + ggtitle('Testing Data') + theme_bw()
pvactbox

# Get Brier scores for testing data
pvact$actual <- ifelse(pvact$actual == "Incumbent Party",1,0)
incumbentModelb <- sum((1-pvact$actual)^2)/length(pvact$actual)
incumbentModelb
bartModelb <- sum((pvact$probability - pvact$actual)^2)/length(pvact$actual)
bartModelb

# Combine Boxplots for publication plot
source("./multiplot.r")
multiplot(pred0box,pvactbox,cols=2)

destroy_bart_machine(bart_machine_train)
